{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {},
   "outputs": [],
   "source": [
    "%run round2_base.ipynb\n",
    "import lightgbm as lgb\n",
    "import random"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {},
   "outputs": [],
   "source": [
    "def drop_features(df, feature):\n",
    "    drop_cols = [col for col in feature.columns if col not in ['buyer_admin_id', 'item_id', 'cate_id', 'store_id']]\n",
    "    print('> drop features number is ', len(drop_cols))\n",
    "    return df.drop(drop_cols, 1)\n",
    "\n",
    "def train_check(df):\n",
    "    dupli_columns = df.columns.duplicated()\n",
    "    if sum(dupli_columns) > 0:\n",
    "        raise ValueError('columns:{} is duplicated'.format(' ,'.join(df.columns[dupli_columns])))\n",
    "\n",
    "def train_split(df, label='irank'):\n",
    "    train_check(df)\n",
    "    df['country_id'] = df['country_id'].map({'xx':0, 'yy':1, 'zz':2}).astype('category')\n",
    "    df['cate_id'] = df['cate_id'].astype('category')\n",
    "    df['store_id'] = df['store_id'].astype('category')\n",
    "    train_df = df[df['is_train'] == 1].reset_index(drop=True)\n",
    "    test_df = df[(df['is_train'] == 0) & (df['data_type']=='base')].reset_index(drop=True)\n",
    "    print('> train_df sample:', len(train_df))\n",
    "    print('>> positive-1 sample:', len(train_df[train_df[label]==1]))\n",
    "    print('>> negtive-1 sample:', len(train_df[train_df[label]==0]))\n",
    "    print('>> 0/1 sample:', len(train_df[train_df[label]==0]) / len(train_df[train_df[label]==1]))\n",
    "    print('> features number is:', len(train_df.columns))\n",
    "    print('> test_df sample:', len(test_df),'\\n')\n",
    "    return train_df, test_df\n",
    "\n",
    "def get_lgb_params():\n",
    "    learning_rate = 0.1\n",
    "    objective = 'binary'\n",
    "    lgb_params = {\n",
    "        'num_leaves': 127, #31\n",
    "        'min_data_in_leaf': 15, # 30 \n",
    "        'objective':objective,\n",
    "        'max_depth': -1,\n",
    "        'learning_rate': learning_rate,\n",
    "        \"min_child_samples\": 15,\n",
    "        \"boosting\": \"gbdt\",\n",
    "        \"feature_fraction\": 0.8,\n",
    "        \"bagging_freq\": 1,\n",
    "        \"bagging_fraction\": 0.9 ,\n",
    "        \"bagging_seed\": 11,\n",
    "        \"metric\": 'auc',\n",
    "        \"lambda_l1\": 0.1,\n",
    "        \"verbosity\": -1,\n",
    "        \"nthread\": 23,\n",
    "        \"random_state\": 4590,\n",
    "         }\n",
    "    return lgb_params\n",
    "\n",
    "def train_evaluation(df):\n",
    "    # label\n",
    "    label = pd.read_hdf('../data/label.h5', '1.0')\n",
    "    label = label[label['buyer_admin_id'].isin(df['buyer_admin_id'])]\n",
    "    \n",
    "    user_num = len(df)\n",
    "    df = df.sort_values(by=['buyer_admin_id', 'irank'], ascending=False)\n",
    "    df['irank'] = df.groupby(['buyer_admin_id']).cumcount() + 1    \n",
    "    df = pd.merge(df, label, how='inner', on=['buyer_admin_id', 'item_id'])\n",
    "    \n",
    "    score_df = pd.DataFrame(df['irank'].value_counts().head(10)/user_num)\n",
    "    score_df['cum_irank'] = score_df['irank'].cumsum()\n",
    "    print(score_df.head(20))\n",
    "    \n",
    "    score = sum(1 / score_df['irank']) / user_num\n",
    "    print('evaluation score: ',score)\n",
    "    return score_df, score\n",
    "\n",
    "def train_user_set(df, n=5):\n",
    "    user_set = df[(df['country_id']!=0)  & (df['data_type']=='base')][['buyer_admin_id']].drop_duplicates()\n",
    "    user_set = user_set.sample(frac=1, random_state=2020)['buyer_admin_id'].unique()\n",
    "    user_set = [user_set[i::n] for i in range(n)]\n",
    "    return user_set\n",
    "    \n",
    "\n",
    "def train_kfold_lgb(features, if_online=None, label='irank', exclude_columns=None, categorical_columns=None, return_valid=False):\n",
    "    train_df, test_df = train_split(features)\n",
    "    features_columns = [f for f in train_df.columns if f not in exclude_columns]\n",
    "    \n",
    "    # Create arrays and dataframes to store results\n",
    "    label_df = train_df[(train_df['irank']==1) & (train_df['data_type']=='base')][['buyer_admin_id', 'item_id']]\n",
    "    oof_preds = train_df[['buyer_admin_id', 'item_id']]\n",
    "    sub_preds = np.zeros(test_df.shape[0])\n",
    "    mean_score = 0\n",
    "    mean_baseline_score = 0\n",
    "    user_set = train_user_set(train_df, n=5)\n",
    "\n",
    "    for n_fold, valid_user in enumerate(user_set):\n",
    "        train_idx = train_df[~train_df['buyer_admin_id'].isin(valid_user)] # add slide user\n",
    "        valid_idx = train_df[(train_df['buyer_admin_id'].isin(valid_user)) & (train_df['data_type']=='base')]\n",
    "        \n",
    "        xgtrain = lgb.Dataset(train_idx[features_columns], label=train_idx[label].values, categorical_feature = categorical_columns)\n",
    "        del train_idx; gc.collect()\n",
    "        xgvalid = lgb.Dataset(valid_idx[features_columns], label=valid_idx[label].values, categorical_feature = categorical_columns)\n",
    "        lgb_params = get_lgb_params()\n",
    "        clf = lgb.train(lgb_params,\n",
    "                         xgtrain,\n",
    "                         valid_sets=[xgtrain,xgvalid],\n",
    "                         valid_names=['train','valid'],\n",
    "                         num_boost_round=10000,\n",
    "                         early_stopping_rounds=100, \n",
    "                         verbose_eval = 500,\n",
    "                         )\n",
    "        \n",
    "        # 结果处理\n",
    "        user_num = len(valid_user)\n",
    "        valid_idx['proba'] = clf.predict(valid_idx[features_columns], num_iteration=clf.best_iteration)\n",
    "        \n",
    "        try:\n",
    "            valid_idx.sort_values(by=['buyer_admin_id', 'baseline'], ascending=[1, 1], inplace=True)\n",
    "        except:\n",
    "            valid_idx.sort_values(by=['buyer_admin_id', 'user-item_without_repeat_irank_without_repeat_MIN'], ascending=[1, 1], inplace=True)\n",
    "        \n",
    "        valid_idx['baseline_rank'] = valid_idx.groupby(['buyer_admin_id']).cumcount() + 1\n",
    "        \n",
    "        valid_idx.sort_values(by=['buyer_admin_id', 'proba'], ascending=[1, 0], inplace=True)\n",
    "        valid_idx['proba_rank'] = valid_idx.groupby(['buyer_admin_id']).cumcount() + 1\n",
    "        \n",
    "        \n",
    "        score_df = pd.merge(label_df, \n",
    "                            valid_idx[['buyer_admin_id', 'item_id', 'baseline_rank', 'proba_rank', 'proba']], \n",
    "                            how='inner', on=['buyer_admin_id', 'item_id'])\n",
    "        \n",
    "        proba_score = sum(1 / score_df['proba_rank']) / user_num\n",
    "        baseline_score = sum(1 / score_df['baseline_rank']) / user_num\n",
    "        \n",
    "        mean_score += proba_score\n",
    "        mean_baseline_score += baseline_score\n",
    "        \n",
    "        score_ratio_df = pd.concat([score_df['proba_rank'].value_counts(),\n",
    "                                   score_df['baseline_rank'].value_counts()], axis=1) / user_num\n",
    "        \n",
    "        print(score_ratio_df.head(5))\n",
    "        print('evaluation baseline score: ',baseline_score)\n",
    "        print('evaluation proba score: ', proba_score)\n",
    "        \n",
    "        \n",
    "        if if_online:\n",
    "            sub_preds += clf.predict(test_df[features_columns], num_iteration=clf.best_iteration) # folds.n_splits\n",
    "    \n",
    "    print('-'*40, '\\n', 'mean evaluation baseline score: ', mean_baseline_score/ (n_fold + 1))\n",
    "    print('mean evaluation proba score: ',mean_score/ (n_fold + 1), '\\n')\n",
    "    \n",
    "    if return_valid:\n",
    "        return valid_idx\n",
    "    \n",
    "    if if_online:\n",
    "        test_df[label] = sub_preds / (n_fold + 1)\n",
    "        submit = test_df[['buyer_admin_id', 'item_id', 'irank']]\n",
    "        submit = submit.sort_values(by=['buyer_admin_id', 'irank'], ascending=[1,0]).reset_index(drop=True)\n",
    "        submit['irank'] = submit.groupby(['buyer_admin_id']).cumcount() + 1\n",
    "        return submit, mean_score\n",
    "    else:\n",
    "        return score_df, mean_score"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_features(data_type):\n",
    "    if data_type == 'base':\n",
    "        path = '../feature/rank/'\n",
    "    elif data_type == 'slide':\n",
    "        path = '../feature/slide/'\n",
    "    \n",
    "    print('> Now dtype is :', data_type)\n",
    "    features = get_sample(dtype='rank', data_type=data_type)\n",
    "    features['data_type'] = data_type\n",
    "\n",
    "    path_tuple = [\n",
    "        ('user_item_dedup_feature', 'asc'),\n",
    "        ('user_item_dedup_feature', 'cate_id_desc'),\n",
    "        ('user_item_dedup_feature', 'store_id_desc'),\n",
    "        ('user_item_dupli_feature', 'all'),\n",
    "        ('user_second_diff_feature', 'item_id'),\n",
    "        ('user_second_diff_feature', 'cate_id'),\n",
    "        ('user_second_diff_feature', 'store_id'),\n",
    "        ('user_cate_lastday_dedup_feature', 'all'),\n",
    "        ('user_item_lastday_dedup_feature', 'all'),\n",
    "        ('user_store_lastday_dedup_feature', 'all'),\n",
    "        ('user_cate_dedup_feature', 'all'),\n",
    "        ('user_item_dedup_feature', 'all'),\n",
    "        ('user_store_dedup_feature', 'all'),\n",
    "        ('user_item_rank_diff_feature', 'all'),\n",
    "        ('user_cate_dedup_feature', 'asc'),\n",
    "        ('user_store_dedup_feature', 'asc'),\n",
    "        ('user_item_lastday_dedup_feature', 'desc'),\n",
    "        ('user_item_lastday_dedup_feature', 'asc'),\n",
    "        ('item_conv_feature', 'all'),\n",
    "        ('user_cate_feature', 'buy'),\n",
    "    ]\n",
    "\n",
    "    for file in path_tuple:\n",
    "        print(file)\n",
    "        feature = pd.read_hdf(path + file[0], file[1])\n",
    "        if file[0] == 'item_conv_feature':\n",
    "            key = ['item_id']\n",
    "        elif 'item_id' in feature.columns:\n",
    "            key = ['buyer_admin_id', 'item_id']\n",
    "            feature = feature.drop([col for col in feature.columns if col in ['cate_id','store_id']], 1)\n",
    "        else:\n",
    "            key = [col for col in feature.columns if col in ['buyer_admin_id', 'cate_id', 'store_id']]\n",
    "        features = features.merge(feature, on=key, how='left')\n",
    "        \n",
    "    return features"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "> Now dtype is : base\n",
      "('user_item_dedup_feature', 'asc')\n",
      "('user_item_dedup_feature', 'cate_id_desc')\n",
      "('user_item_dedup_feature', 'store_id_desc')\n",
      "('user_item_dupli_feature', 'all')\n",
      "('user_second_diff_feature', 'item_id')\n",
      "('user_second_diff_feature', 'cate_id')\n",
      "('user_second_diff_feature', 'store_id')\n",
      "('user_cate_lastday_dedup_feature', 'all')\n",
      "('user_item_lastday_dedup_feature', 'all')\n",
      "('user_store_lastday_dedup_feature', 'all')\n",
      "('user_cate_dedup_feature', 'all')\n",
      "('user_item_dedup_feature', 'all')\n",
      "('user_store_dedup_feature', 'all')\n",
      "('user_item_rank_diff_feature', 'all')\n",
      "('user_cate_dedup_feature', 'asc')\n",
      "('user_store_dedup_feature', 'asc')\n",
      "('user_item_lastday_dedup_feature', 'desc')\n",
      "('user_item_lastday_dedup_feature', 'asc')\n",
      "('item_conv_feature', 'all')\n",
      "('user_cate_feature', 'buy')\n"
     ]
    }
   ],
   "source": [
    "base_features = get_features('base')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "> train_df sample: 1378665\n",
      ">> positive-1 sample: 418821\n",
      ">> negtive-1 sample: 959844\n",
      ">> 0/1 sample: 2.291776200333794\n",
      "> features number is: 505\n",
      "> test_df sample: 31777 \n",
      "\n",
      "Training until validation scores don't improve for 100 rounds.\n",
      "Early stopping, best iteration is:\n",
      "[54]\ttrain's auc: 0.966218\tvalid's auc: 0.948011\n",
      "   proba_rank  baseline_rank\n",
      "1    0.816651       0.790923\n",
      "2    0.107899       0.110718\n",
      "3    0.038448       0.040905\n",
      "4    0.016767       0.022331\n",
      "5    0.006866       0.011708\n",
      "evaluation baseline score:  0.8708763677228938\n",
      "evaluation proba score:  0.8907178695339567\n",
      "Training until validation scores don't improve for 100 rounds.\n",
      "Early stopping, best iteration is:\n",
      "[76]\ttrain's auc: 0.970021\tvalid's auc: 0.95105\n",
      "   proba_rank  baseline_rank\n",
      "1    0.820481       0.797933\n",
      "2    0.109489       0.111440\n",
      "3    0.034401       0.040616\n",
      "4    0.015755       0.019368\n",
      "5    0.008311       0.010985\n",
      "evaluation baseline score:  0.876685270006407\n",
      "evaluation proba score:  0.893745738117942\n",
      "Training until validation scores don't improve for 100 rounds.\n",
      "Early stopping, best iteration is:\n",
      "[77]\ttrain's auc: 0.970182\tvalid's auc: 0.949557\n",
      "   proba_rank  baseline_rank\n",
      "1    0.816710       0.789029\n",
      "2    0.110437       0.110726\n",
      "3    0.037583       0.044738\n",
      "4    0.013443       0.020888\n",
      "5    0.007300       0.012359\n",
      "evaluation baseline score:  0.8698717483433677\n",
      "evaluation proba score:  0.8911473624261906\n",
      "Training until validation scores don't improve for 100 rounds.\n",
      "Early stopping, best iteration is:\n",
      "[57]\ttrain's auc: 0.966843\tvalid's auc: 0.948763\n",
      "   proba_rank  baseline_rank\n",
      "1    0.818156       0.791920\n",
      "2    0.109642       0.110798\n",
      "3    0.033463       0.043510\n",
      "4    0.017202       0.020454\n",
      "5    0.008890       0.012215\n",
      "evaluation baseline score:  0.8720901307794503\n",
      "evaluation proba score:  0.8918336183377645\n",
      "Training until validation scores don't improve for 100 rounds.\n",
      "Early stopping, best iteration is:\n",
      "[55]\ttrain's auc: 0.966434\tvalid's auc: 0.950489\n",
      "   proba_rank  baseline_rank\n",
      "1    0.821191       0.798497\n",
      "2    0.107762       0.107329\n",
      "3    0.035343       0.042209\n",
      "4    0.015250       0.019948\n",
      "5    0.009251       0.009829\n",
      "evaluation baseline score:  0.875995340221578\n",
      "evaluation proba score:  0.8939275629114776\n",
      "---------------------------------------- \n",
      " mean evaluation baseline score:  0.8731037714147393\n",
      "mean evaluation proba score:  0.8922744302654664 \n",
      "\n"
     ]
    }
   ],
   "source": [
    "submit, score = train_kfold_lgb(base_features.copy(),\n",
    "                                if_online=True,\n",
    "                                label='irank',\n",
    "                                exclude_columns=['buyer_admin_id', 'item_id', 'irank', 'is_train', 'baseline', 'data_type'], \n",
    "                                categorical_columns=['cate_id', 'store_id', 'country_id'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# def get_submitfile():\n",
    "# 品类填充\n",
    "recall = pd.read_hdf('../output/recall_by_next_5_item', 'all')\n",
    "submit_recall = pd.concat([submit1, recall])\n",
    "submit_recall['irank'] = submit_recall.groupby(['buyer_admin_id']).cumcount()+1\n",
    "submit_file  = submit_transform(submit_recall, '0907-01.csv')"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python3 (PySpark)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.7"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
